import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
%matplotlib inline
# Load the churn dataset from the first worksheet and preview the first rows.
df=pd.read_excel("customer_churn_large_dataset.xlsx",sheet_name='Sheet1')
print(df.head())
CustomerID Name Age Gender Location \ 0 1.0 Customer_1 63.0 Male Los Angeles 1 2.0 Customer_2 62.0 Female New York 2 3.0 Customer_3 24.0 Female Los Angeles 3 4.0 Customer_4 36.0 Female Miami 4 5.0 Customer_5 46.0 Female Miami Subscription_Length_Months Monthly_Bill Total_Usage_GB Churn 0 17.0 73.36 236.0 0.0 1 1.0 48.76 172.0 0.0 2 5.0 85.47 460.0 0.0 3 3.0 97.94 297.0 1.0 4 19.0 58.14 266.0 0.0
# Column dtypes and null counts — the printed output shows all 100000 rows non-null.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100000 entries, 0 to 99999 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CustomerID 100000 non-null float64 1 Name 100000 non-null object 2 Age 100000 non-null float64 3 Gender 100000 non-null object 4 Location 100000 non-null object 5 Subscription_Length_Months 100000 non-null float64 6 Monthly_Bill 100000 non-null float64 7 Total_Usage_GB 100000 non-null float64 8 Churn 100000 non-null float64 dtypes: float64(6), object(3) memory usage: 6.9+ MB
# Summary statistics for the numeric columns (transposed for readability).
df.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CustomerID | 100000.0 | 50000.500000 | 28867.657797 | 1.0 | 25000.75 | 50000.50 | 75000.25 | 100000.0 |
| Age | 100000.0 | 44.027020 | 15.280283 | 18.0 | 31.00 | 44.00 | 57.00 | 70.0 |
| Subscription_Length_Months | 100000.0 | 12.490100 | 6.926461 | 1.0 | 6.00 | 12.00 | 19.00 | 24.0 |
| Monthly_Bill | 100000.0 | 65.053197 | 20.230696 | 30.0 | 47.54 | 65.01 | 82.64 | 100.0 |
| Total_Usage_GB | 100000.0 | 274.393650 | 130.463063 | 50.0 | 161.00 | 274.00 | 387.00 | 500.0 |
| Churn | 100000.0 | 0.497790 | 0.499998 | 0.0 | 0.00 | 0.00 | 1.00 | 1.0 |
# Missing-value check per column (all zero per the output below).
df.isnull().sum()
CustomerID 0 Name 0 Age 0 Gender 0 Location 0 Subscription_Length_Months 0 Monthly_Bill 0 Total_Usage_GB 0 Churn 0 dtype: int64
# `df` is already a DataFrame — the original `df = pd.DataFrame(df)` re-wrap
# was a no-op and has been removed.
df.head()
| CustomerID | Name | Age | Gender | Location | Subscription_Length_Months | Monthly_Bill | Total_Usage_GB | Churn | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | Customer_1 | 63.0 | Male | Los Angeles | 17.0 | 73.36 | 236.0 | 0.0 |
| 1 | 2.0 | Customer_2 | 62.0 | Female | New York | 1.0 | 48.76 | 172.0 | 0.0 |
| 2 | 3.0 | Customer_3 | 24.0 | Female | Los Angeles | 5.0 | 85.47 | 460.0 | 0.0 |
| 3 | 4.0 | Customer_4 | 36.0 | Female | Miami | 3.0 | 97.94 | 297.0 | 1.0 |
| 4 | 5.0 | Customer_5 | 46.0 | Female | Miami | 19.0 | 58.14 | 266.0 | 0.0 |
# Drop identifier columns in a single call: CustomerID and Name are unique
# per customer and carry no predictive signal.
df.drop(columns=['CustomerID', 'Name'], inplace=True)
df.head()
| Age | Gender | Location | Subscription_Length_Months | Monthly_Bill | Total_Usage_GB | Churn | |
|---|---|---|---|---|---|---|---|
| 0 | 63.0 | Male | Los Angeles | 17.0 | 73.36 | 236.0 | 0.0 |
| 1 | 62.0 | Female | New York | 1.0 | 48.76 | 172.0 | 0.0 |
| 2 | 24.0 | Female | Los Angeles | 5.0 | 85.47 | 460.0 | 0.0 |
| 3 | 36.0 | Female | Miami | 3.0 | 97.94 | 297.0 | 1.0 |
| 4 | 46.0 | Female | Miami | 19.0 | 58.14 | 266.0 | 0.0 |
# Class balance of Gender (near 50/50 per the output below).
df['Gender'].value_counts()
Female 50216 Male 49784 Name: Gender, dtype: int64
# Target distribution — churn classes are nearly balanced.
df['Churn'].value_counts()
0.0 50221 1.0 49779 Name: Churn, dtype: int64
# Distinct city values in Location (five categories per the output below).
df['Location'].unique()
array(['Los Angeles', 'New York', 'Miami', 'Chicago', 'Houston'],
dtype=object)
# Bar chart of churn vs. non-churn counts.
sns.countplot(data=df,x='Churn')
<AxesSubplot:xlabel='Churn', ylabel='count'>
# Interactive EDA: churn split by gender/age, by location, usage vs. bill by
# gender, and the Monthly_Bill distribution by churn.
px.histogram(df,x='Gender',y='Age',color='Churn')
px.histogram(df,x='Location',color='Churn')
px.scatter(df,x='Total_Usage_GB',y='Monthly_Bill',color='Gender')
sns.histplot(x='Monthly_Bill', data=df, hue='Churn', kde=True)
<AxesSubplot:xlabel='Monthly_Bill', ylabel='Count'>
# More EDA: usage vs. bill coloured by churn, and per-feature distributions
# split by churn.
px.scatter(df,x='Total_Usage_GB',y='Monthly_Bill',color='Churn')
px.histogram(df,x='Total_Usage_GB',color='Churn')
px.histogram(df,x='Monthly_Bill',color='Churn')
px.scatter(df,x='Monthly_Bill',y='Subscription_Length_Months',color='Churn')
px.histogram(df,x='Subscription_Length_Months',color='Churn')
from sklearn.preprocessing import LabelEncoder

# Use one encoder per column: the original reused a single LabelEncoder for
# both Gender and Location, so its fitted classes_ for Gender were overwritten
# and could never be inverse_transformed later. fit_transform refits each
# time, so the encoded integer values themselves are unchanged.
gender_le = LabelEncoder()
location_le = LabelEncoder()
df["Gender"] = gender_le.fit_transform(df["Gender"])
df["Location"] = location_le.fit_transform(df["Location"])
df.head()
| Age | Gender | Location | Subscription_Length_Months | Monthly_Bill | Total_Usage_GB | Churn | |
|---|---|---|---|---|---|---|---|
| 0 | 63.0 | 1 | 2 | 17.0 | 73.36 | 236.0 | 0.0 |
| 1 | 62.0 | 0 | 4 | 1.0 | 48.76 | 172.0 | 0.0 |
| 2 | 24.0 | 0 | 2 | 5.0 | 85.47 | 460.0 | 0.0 |
| 3 | 36.0 | 0 | 3 | 3.0 | 97.94 | 297.0 | 1.0 |
| 4 | 46.0 | 0 | 3 | 19.0 | 58.14 | 266.0 | 0.0 |
# Re-check summary statistics now that Gender/Location are numeric.
df.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Age | 100000.0 | 44.027020 | 15.280283 | 18.0 | 31.00 | 44.00 | 57.00 | 70.0 |
| Gender | 100000.0 | 0.497840 | 0.499998 | 0.0 | 0.00 | 0.00 | 1.00 | 1.0 |
| Location | 100000.0 | 1.995840 | 1.411638 | 0.0 | 1.00 | 2.00 | 3.00 | 4.0 |
| Subscription_Length_Months | 100000.0 | 12.490100 | 6.926461 | 1.0 | 6.00 | 12.00 | 19.00 | 24.0 |
| Monthly_Bill | 100000.0 | 65.053197 | 20.230696 | 30.0 | 47.54 | 65.01 | 82.64 | 100.0 |
| Total_Usage_GB | 100000.0 | 274.393650 | 130.463063 | 50.0 | 161.00 | 274.00 | 387.00 | 500.0 |
| Churn | 100000.0 | 0.497790 | 0.499998 | 0.0 | 0.00 | 0.00 | 1.00 | 1.0 |
# Baseline churn rate across all customers (~0.50 — classes are balanced).
global_mean=df['Churn'].mean()
round(global_mean,2)
0.5
# Churn risk ratio by age: per-group churn rate relative to the global rate
# (values near 1.0 mean the feature adds no signal).
mean = df.groupby('Age')['Churn'].mean()
risk = mean / global_mean
risk
Age 18.0 1.025741 19.0 0.965628 20.0 0.993788 21.0 0.983790 22.0 0.978022 23.0 0.987938 24.0 0.949827 25.0 1.009271 26.0 0.990504 27.0 1.050381 28.0 1.013517 29.0 0.987921 30.0 0.969619 31.0 1.020819 32.0 1.036626 33.0 1.030950 34.0 0.965872 35.0 1.021316 36.0 1.038479 37.0 0.978657 38.0 1.008519 39.0 0.992223 40.0 0.973905 41.0 1.058832 42.0 0.982581 43.0 1.030858 44.0 0.993955 45.0 0.992894 46.0 0.994207 47.0 1.003896 48.0 0.990128 49.0 0.965432 50.0 0.949085 51.0 1.002878 52.0 0.998933 53.0 1.025111 54.0 1.015035 55.0 1.006554 56.0 0.944345 57.0 1.009612 58.0 1.003369 59.0 1.053411 60.0 0.999164 61.0 0.984972 62.0 1.018966 63.0 0.957890 64.0 0.983786 65.0 1.003892 66.0 1.032660 67.0 1.012261 68.0 1.011449 69.0 1.012845 70.0 0.985093 Name: Churn, dtype: float64
# Churn risk ratio by (encoded) location, relative to the global churn rate.
mean = df.groupby('Location')['Churn'].mean()
risk = mean / global_mean
risk
Location 0 1.001017 1 0.986550 2 0.990356 3 1.010507 4 1.011791 Name: Churn, dtype: float64
# Churn risk ratio by subscription length, relative to the global churn rate.
mean = df.groupby('Subscription_Length_Months')['Churn'].mean()
risk = mean / global_mean
risk
Subscription_Length_Months 1.0 1.011298 2.0 1.002064 3.0 1.013182 4.0 0.989243 5.0 0.970966 6.0 0.979473 7.0 1.008972 8.0 0.998079 9.0 0.989861 10.0 1.005166 11.0 1.000613 12.0 1.008066 13.0 1.018948 14.0 0.988466 15.0 0.979097 16.0 1.011327 17.0 0.999002 18.0 0.997938 19.0 1.013735 20.0 0.996737 21.0 0.985579 22.0 1.007029 23.0 1.007638 24.0 1.017383 Name: Churn, dtype: float64
# Churn risk ratio by exact Monthly_Bill value, relative to the global rate.
# NOTE(review): Monthly_Bill is effectively continuous (7001 distinct values
# per the output), so per-value groups are tiny and these ratios are noisy —
# binning with pd.cut would give a more stable picture. Left as-is.
mean = df.groupby('Monthly_Bill')['Churn'].mean()
risk = mean / global_mean
risk
Monthly_Bill
30.00 0.803552
30.01 1.063524
30.02 0.860948
30.03 1.147931
30.04 1.129995
...
99.96 1.187065
99.97 0.547876
99.98 1.374496
99.99 1.116044
100.00 0.669626
Name: Churn, Length: 7001, dtype: float64
# Churn risk ratio by total usage (GB), relative to the global churn rate.
mean = df.groupby('Total_Usage_GB')['Churn'].mean()
risk = mean / global_mean
risk
Total_Usage_GB
50.0 0.960768
51.0 0.999679
52.0 1.163035
53.0 1.078843
54.0 0.994688
...
496.0 0.936876
497.0 0.977046
498.0 0.989883
499.0 1.021464
500.0 0.869148
Name: Churn, Length: 451, dtype: float64
# Churn risk ratio by (encoded) gender, relative to the global churn rate.
mean = df.groupby('Gender')['Churn'].mean()
risk = mean / global_mean
risk
Gender 0 0.997879 1 1.002140 Name: Churn, dtype: float64
# Drop, in a single call, the features whose risk ratios stayed ~1.0 above
# (no churn signal): Gender, Subscription_Length_Months, Location and Age.
df.drop(columns=['Gender', 'Subscription_Length_Months', 'Location', 'Age'],
        inplace=True)
df.head()
| Monthly_Bill | Total_Usage_GB | Churn | |
|---|---|---|---|
| 0 | 73.36 | 236.0 | 0.0 |
| 1 | 48.76 | 172.0 | 0.0 |
| 2 | 85.47 | 460.0 | 0.0 |
| 3 | 97.94 | 297.0 | 1.0 |
| 4 | 58.14 | 266.0 | 0.0 |
# Feature matrix (every column except the target) and the target vector.
x = df.drop(columns=["Churn"]).values
y = df["Churn"]
x
array([[ 73.36, 236. ],
[ 48.76, 172. ],
[ 85.47, 460. ],
...,
[ 96.11, 251. ],
[ 49.25, 434. ],
[ 76.57, 173. ]])
# Inspect the target vector.
y
0 0.0
1 0.0
2 0.0
3 1.0
4 0.0
...
99995 1.0
99996 0.0
99997 1.0
99998 1.0
99999 1.0
Name: Churn, Length: 100000, dtype: float64
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

# Split BEFORE scaling: the original fitted the scaler on the full dataset,
# leaking the test set's mean/std into training. Fit the scaler on the
# training split only, then apply the same transform to the test split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=101)
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
x_train.shape, x_test.shape
((75000, 2), (25000, 2))
# Baseline model: logistic regression with default hyperparameters.
lr = LogisticRegression()
lr.fit(x_train,y_train)
LogisticRegression()
# Evaluate logistic regression on the held-out test split.
prediction=lr.predict(x_test)
print(confusion_matrix(y_test,prediction))
print(classification_report(y_test,prediction))
[[9692 2909]
[9740 2659]]
precision recall f1-score support
0.0 0.50 0.77 0.61 12601
1.0 0.48 0.21 0.30 12399
accuracy 0.49 25000
macro avg 0.49 0.49 0.45 25000
weighted avg 0.49 0.49 0.45 25000
# Report test accuracy ("accuray" typo in the message fixed).
print("Logistic Regression accuracy is {:.2f}%".format(lr.score(x_test, y_test) * 100))
Logistic Regression accuracy is 49.40%
from sklearn.neighbors import KNeighborsClassifier

# Elbow method: misclassification rate on the test split for k = 1..49,
# plotted to pick a k where the error flattens out.
error_rate = []
for k in range(1, 50):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(x_train, y_train)
    error_rate.append(np.mean(model.predict(x_test) != y_test))

plt.figure(figsize=(14, 6))
plt.plot(range(1, 50), error_rate, color='blue', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate Vs K Value')
plt.ylabel('Error Rate')
plt.xlabel('K Value')
Text(0.5, 0, 'K Value')
# Final KNN with k=28 (chosen from the elbow plot), evaluated on the test split.
knn=KNeighborsClassifier(n_neighbors=28)
knn.fit(x_train,y_train)
pred=knn.predict(x_test)
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
[[7490 5111]
[7181 5218]]
precision recall f1-score support
0.0 0.51 0.59 0.55 12601
1.0 0.51 0.42 0.46 12399
accuracy 0.51 25000
macro avg 0.51 0.51 0.50 25000
weighted avg 0.51 0.51 0.50 25000
# Report KNN test accuracy.
print("The accuracy of the KNN Model is {:.2f}%".format(knn.score(x_test,y_test)*100))
The accuracy of the KNN Model is 50.83%
from sklearn.svm import SVC

# RBF-kernel SVM with default hyperparameters. The original called
# svm.fit(...) twice back-to-back; the duplicate (identical, wasteful)
# fit has been removed.
svm=SVC()
svm.fit(x_train,y_train)
prediction=svm.predict(x_test)
print(confusion_matrix(y_test,prediction))
print(classification_report(y_test,prediction))
[[8092 4509]
[8055 4344]]
precision recall f1-score support
0.0 0.50 0.64 0.56 12601
1.0 0.49 0.35 0.41 12399
accuracy 0.50 25000
macro avg 0.50 0.50 0.49 25000
weighted avg 0.50 0.50 0.49 25000
# Report SVM test accuracy.
print("The accuracy of the Support Vector Machine Model is {:.2f}%".format(svm.score(x_test,y_test)*100))
The accuracy of the Support Vector Machine Model is 49.74%
from sklearn.tree import DecisionTreeClassifier

# Decision tree using entropy (information gain) as the split criterion,
# evaluated on the held-out test split.
dtree = DecisionTreeClassifier(criterion='entropy')
dtree.fit(x_train, y_train)
tree_pred = dtree.predict(x_test)
print(confusion_matrix(y_test, tree_pred))
print(classification_report(y_test, tree_pred))
[[6392 6209]
[6197 6202]]
precision recall f1-score support
0.0 0.51 0.51 0.51 12601
1.0 0.50 0.50 0.50 12399
accuracy 0.50 25000
macro avg 0.50 0.50 0.50 25000
weighted avg 0.50 0.50 0.50 25000
# Report decision-tree test accuracy ("Decicion" typo in the message fixed).
print("The accuracy of the Decision Tree Model is {:.2f}%".format(dtree.score(x_test,y_test)*100))
The accuracy of the Decision Tree Model is 50.38%
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Grid-search class weights for logistic regression, scoring by F1 on
# stratified 5-fold CV. `w` is the weight for class 0; class 1 gets 1-w.
# (Renamed from `x` so the comprehension variable no longer shadows the
# feature matrix `x`.)
lr = LogisticRegression(solver='newton-cg')
weights = np.linspace(0.0, 0.99, 200)
param_grid = {'class_weight': [{0: w, 1: 1.0 - w} for w in weights]}
gridsearch = GridSearchCV(estimator=lr,
                          param_grid=param_grid,
                          cv=StratifiedKFold(),
                          n_jobs=-1,
                          scoring='f1',
                          verbose=2).fit(x_train, y_train)

sns.set_style('whitegrid')
plt.figure(figsize=(12, 8))
# Plot mean CV F1 against the weight assigned to class 1 (= 1 - w).
weigh_data = pd.DataFrame({'score': gridsearch.cv_results_['mean_test_score'],
                           'weight': (1 - weights)})
# Pass x/y as keywords: positional data args are deprecated and removed in
# seaborn >= 0.12 (this was raising the FutureWarning in the output).
sns.lineplot(x=weigh_data['weight'], y=weigh_data['score'])
plt.xlabel('Weight for class 1')
plt.ylabel('F1 score')
plt.xticks([round(i / 10, 1) for i in range(0, 11, 1)])
plt.title('Scoring for different class weights', fontsize=24)
Fitting 5 folds for each of 200 candidates, totalling 1000 fits
D:\anaconda\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
Text(0.5, 1.0, 'Scoring for different class weights')
from sklearn.linear_model import LogisticRegression
# Refit logistic regression with the class weighting suggested by the search.
# NOTE(review): the printed confusion matrix shows this weighting makes the
# model predict class 1 for every test sample — the weight choice (or the
# features themselves) likely needs revisiting before deployment.
lr = LogisticRegression(solver='newton-cg', class_weight={0: 0.4, 1: 0.6})
lr.fit(x_train, y_train)
pred_test = lr.predict(x_test)
print(confusion_matrix(y_test,pred_test))
print(classification_report(y_test,pred_test))
[[ 0 12601]
[ 0 12399]]
precision recall f1-score support
0.0 0.00 0.00 0.00 12601
1.0 0.50 1.00 0.66 12399
accuracy 0.50 25000
macro avg 0.25 0.50 0.33 25000
weighted avg 0.25 0.50 0.33 25000
D:\anaconda\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. D:\anaconda\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior. D:\anaconda\lib\site-packages\sklearn\metrics\_classification.py:1318: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
# Save the decision-tree model to disk. Use a context manager so the file
# handle is closed (and the pickle flushed) even if dump() raises; the
# original left the handle open.
import pickle

filename = 'DTree_model.sav'
with open(filename, 'wb') as f:
    pickle.dump(dtree, f)